#!/usr/bin/env python3

'''
Add Pfam annotation to genbank
Last updated: Aug 12, 2020
'''

# Import modules
import os
import re
from argparse import ArgumentParser
from collections import defaultdict


# Main function
def main():
    '''Main function'''
    argparse_usage = (
        'gff3_add_pfam.py -i <input_gff3> -p <pfam_file> -o <output_prefix>'
    )
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument(
        '-i', '--input_gff3', required=True, nargs=1,
        help='Input gff3 file'
    )
    parser.add_argument(
        '-p', '--pfam_file', required=True, nargs=1,
        help='Pfam file generated by Interproscan'
    )

    args = parser.parse_args()
    input_gff3 = os.path.abspath(args.input_gff3[0])
    pfam_file = os.path.abspath(args.pfam_file[0])

    # Run functions :) Slow is as good as Fast
    d_pfam = parse_pfam(pfam_file)
    add_pfam_to_gff3(input_gff3, d_pfam)


def import_file(input_file):
    '''Import file'''
    with open(input_file) as f_in:
        txt = list(line.rstrip() for line in f_in)
    return txt


def parse_pfam(pfam_file):
    '''Parse Pfam'''
    pfam_txt = import_file(pfam_file)
    d_pfam = defaultdict(list)
    for line in pfam_txt:
        line_split = line.split('\t')
        prot_id = line_split[0]
        analysis = line_split[3]  # E.g., pfam / smart
        if analysis != 'Pfam':
            continue
        pfam_id = line_split[4]
        pfam_desc = line_split[5]
        # GFF3 uses ';' as separator, so remove ';' if exist in pfam_desc
        pfam_desc = pfam_desc.replace(';', '')
        pfam = '{},{}'.format(pfam_id, pfam_desc)
        d_pfam[prot_id].append(pfam)
    return d_pfam


def add_pfam_to_gff3(input_gff3, d_pfam):
    '''Add Pfam annotation to GFF3'''
    output_file = input_gff3.replace('.gff3', '_pfam.gff3')
    output_handle = open(output_file, 'w')
    output_handle.write('##gff-version 3\n')
    gff3_txt = import_file(input_gff3)
    reg_id = re.compile(r'ID=(\S+);')
    for line in gff3_txt[1:]:
        if line.startswith('#'):
            continue
        line_split = line.split('\t')
        entry_type = line_split[2]
        if entry_type == 'CDS':
            cds_id = reg_id.search(line).group(1)
            # prot_id = cds_id.split('.')[0]
            prot_id = cds_id.replace('.c1', '')
            prot_id = prot_id.replace('.cds', '')
            if d_pfam[prot_id] != []:
                line = '{};product={}'.format(line, '|'.join(d_pfam[prot_id]))
        output_handle.write('{}\n'.format(line))
    output_handle.close()


if __name__ == '__main__':
    main()
